OperatorNormalizer.java example

Explorer

MathMLCan-master
- src
  - main
    - java
      - cz
        muni
        fi
        mir
        mathmlcanonicalization
        ConfigError.java
        ConfigException.java
        MathMLCanonicalizer.java
        MathMLCanonicalizerCommandLineTool.java
        Settings.java
        modules
        AbstractModule.java
        DOMModule.java
        ElementMinimizer.java
        FunctionNormalizer.java
        MfencedReplacer.java
        Module.java
        ModuleException.java
        MrowNormalizer.java
        OperatorNormalizer.java
        ScriptNormalizer.java
        StreamModule.java
        UnaryOperatorRemover.java
        utils
        DTDManipulator.java
  - test
    - java
      - cz
        muni
        fi
        mir
        mathmlcanonicalization
        MathMLCanonicalizerTest.java
        modules
        AbstractModuleTest.java
        ElementMinimizerTest.java
        FunctionNormalizerTest.java
        MfencedReplacerTest.java
        ModuleTestResources.java
        MrowNormalizerTest.java
        OperatorNormalizerTest.java
        ScriptNormalizerTest.java
        UnaryOperatorRemoverTest.java
        utils
        DTDManipulatorTest.java

/**
 * Copyright 2013 MIR@MU Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package cz.muni.fi.mir.mathmlcanonicalization.modules;

import static cz.muni.fi.mir.mathmlcanonicalization.modules.AbstractModule.MATHMLNS;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jdom2.Content;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Text;
import org.jdom2.filter.ContentFilter;
import org.jdom2.filter.ElementFilter;

/**
 * Normalize the way to express an function applied to arguments in MathML.
 * <div class="simpleTagLabel">Input</div> Well-formed MathML, not processed by
 * MrowMinimizer yet
 * <div class="simpleTagLabel">Output</div> The original code with:
 * <ul>
 * <li>normalized Unicode symbols</li>
 * <li>unified operators</li>
 * <li>no redundant operators</li>
 * </ul>
 *
 * @author David Formanek
 */
public class OperatorNormalizer extends AbstractModule implements DOMModule {

    private static final Logger LOGGER = Logger.getLogger(OperatorNormalizer.class.getName());
    // properties key names
    private static final String REMOVE_EMPTY_OPERATORS = "removeempty";
    private static final String OPERATORS_TO_REMOVE = "removeoperators";
    private static final String OPERATOR_REPLACEMENTS = "replaceoperators";
    private static final String COLON_REPLACEMENT = "colonreplacement";
    private static final String NORMALIZATION_FORM = "normalizationform";
    private static final String OPERATORS = "operators";
    private static final String IDENTIFIERS = "identifiers";

    public OperatorNormalizer() {
        declareProperty(REMOVE_EMPTY_OPERATORS);
        declareProperty(OPERATORS_TO_REMOVE);
        declareProperty(OPERATOR_REPLACEMENTS);
        declareProperty(COLON_REPLACEMENT);
        declareProperty(NORMALIZATION_FORM);
        declareProperty(OPERATORS);
        declareProperty(IDENTIFIERS);
    }

    @Override
    public void execute(final Document doc) {
        if (doc == null) {
            throw new NullPointerException("doc");
        }
        final Element root = doc.getRootElement();

        // TODO: convert Unicode superscripts (supX entities) to msup etc.

        final String normalizerFormStr = getProperty(NORMALIZATION_FORM);
        if (normalizerFormStr.isEmpty()) {
            LOGGER.fine("Unicode text normalization is switched off");
        } else {
            try {
                Normalizer.Form normalizerForm = Normalizer.Form.valueOf(normalizerFormStr);
                normalizeUnicode(root, normalizerForm);
            } catch (IllegalArgumentException ex) {
                throw new IllegalArgumentException("Invalid configuration value: "
                        + NORMALIZATION_FORM, ex);
            }
        }
        unifyOperators(root);
    }

    /**
     * Converts bad identifiers to operators, removes redundant and replaces
     */
    private void unifyOperators(final Element ancestor) {
        assert ancestor != null;
        final Set<String> toRemove = getPropertySet(OPERATORS_TO_REMOVE);
        final Map<String, String> replaceMap = getPropertyMap(OPERATOR_REPLACEMENTS);
        if (!getProperty(COLON_REPLACEMENT).isEmpty()) {
            replaceMap.put(":", getProperty(COLON_REPLACEMENT));
        }
        final Set<String> operators = getPropertySet(OPERATORS);
        operators.addAll(toRemove);
        operators.addAll(replaceMap.keySet());
        operators.addAll(replaceMap.values());

        replaceIdentifiers(ancestor, operators);

        if (isEnabled(REMOVE_EMPTY_OPERATORS) || !toRemove.isEmpty()) {
            removeSpareOperators(ancestor, toRemove);
        } else {
            LOGGER.fine("No operators set for removal");
        }

        if (replaceMap.isEmpty()) {
            LOGGER.fine("No operators set to replace");
        } else {
            replaceOperators(ancestor, replaceMap);
        }
        
        final Set<String> identifiers = getPropertySet(IDENTIFIERS);
        operatorsToIdentifiers(ancestor, identifiers);
    }

    private void normalizeUnicode(final Element ancestor, final Normalizer.Form form) {
        assert ancestor != null && form != null;
        final List<Text> texts = new ArrayList<Text>();
        final ContentFilter textFilter = new ContentFilter(ContentFilter.TEXT);
        for (Content text : ancestor.getContent(textFilter)) {
            texts.add((Text) text);
        }
        for (Element element : ancestor.getDescendants(new ElementFilter())) {
            for (Content text : element.getContent(textFilter)) {
                texts.add((Text) text);
            }
        }
        for (Text text : texts) {
            if (Normalizer.isNormalized(text.getText(), form)) {
                continue;
            }
            final String normalizedString = Normalizer.normalize(text.getText(), form);
            LOGGER.log(Level.FINE, "Text ''{0}'' normalized to ''{1}''",
                    new Object[]{text.getText(), normalizedString});
            text.setText(normalizedString);
            assert Normalizer.isNormalized(text.getText(), form);
        }
    }

    private void removeSpareOperators(final Element element, final Collection<String> spareOperators) {
        assert element != null && spareOperators != null && !spareOperators.isEmpty();
        final List<Element> children = element.getChildren();
        for (int i = 0; i < children.size(); i++) {
            final Element actual = children.get(i); // actual element
            if (isOperator(actual)) {
            	//Keep special case where asterisk is by itself in a subscript
                String parent = actual.getParentElement().getName();
                if (isSpareOperator(actual, spareOperators) && !(parent.equals("msub"))
                        && !(parent.equals("msubsup") && !(parent.equals("msup")))) {
                    actual.detach();
                    i--; // move iterator back after detaching so it points to next element
                    LOGGER.log(Level.FINE, "Operator {0} removed", actual);
                }
            } else {
                removeSpareOperators(actual, spareOperators);
            }
        }
    }

    private boolean isSpareOperator(final Element operator, final Collection<String> spareOperators) {
        assert operator != null && spareOperators != null && isOperator(operator);
        return (isEnabled(REMOVE_EMPTY_OPERATORS) && operator.getText().isEmpty())
                || (spareOperators.contains(operator.getTextTrim()));
    }

    private void replaceOperators(final Element element, final Map<String, String> replacements) {
        assert element != null && replacements != null;
        List<Element> operatorsToReplace = new ArrayList<Element>();
        for (Element operator : element.getDescendants(new ElementFilter(OPERATOR, MATHMLNS))) {
            if (replacements.containsKey(operator.getTextTrim())) {
                operatorsToReplace.add(operator);
            }
        }
        for (Element operator : operatorsToReplace) {
            final String oldOperator = operator.getTextTrim();
            final String newOperator = replacements.get(oldOperator);
            operator.setText(newOperator);
            LOGGER.log(Level.FINE, "Operator ''{0}'' was replaced by ''{1}''",
                    new Object[]{oldOperator, newOperator});
        }
    }

    private void replaceIdentifiers(final Element ancestor, final Set<String> operators) {
        assert ancestor != null && operators != null;
        final List<Element> toReplace = new ArrayList<Element>();
        for (Element element : ancestor.getDescendants(new ElementFilter(IDENTIFIER, MATHMLNS))) {
            // TODO: control whole ranges of symbols rather than listed ones
            if (operators.contains(element.getTextTrim())) {
                toReplace.add(element);
            }
        }
        for (Element element : toReplace) {
            LOGGER.log(Level.FINE, "Creating an operator from {0}", element.getText());
            replaceElement(element, OPERATOR);
        }
    }

    private void operatorsToIdentifiers(final Element ancestor, final Set<String> identifiers) {
        assert ancestor != null && identifiers != null;
        final List<Element> toReplace = new ArrayList<Element>();
        for (Element element : ancestor.getDescendants(new ElementFilter(OPERATOR, MATHMLNS))) {
            if (identifiers.contains(element.getTextTrim())) {
                toReplace.add(element);
            }
        }
        for (Element element : toReplace) {
            LOGGER.log(Level.FINE, "Creating an identifier from {0}", element.getText());
            replaceElement(element, IDENTIFIER);
        }
    }
    
    private Map<String, String> getPropertyMap(final String property) {
        assert property != null && isProperty(property);
        final Map<String, String> propertyMap = new HashMap<String, String>();
        final String[] mappings = getProperty(property).split(" ");
        for (String mapping : mappings) {
            final String[] mappingPair = mapping.split(":", 2);
            if (mappingPair.length != 2) {
                throw new IllegalArgumentException("property has wrong format");
            }
            propertyMap.put(mappingPair[0], mappingPair[1]);
        }
        return propertyMap;
    }
}